{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Results for predictions in-sample data:\n",
      "tp = 68.0\tfn = 6.0\n",
      "fp = 7.0\ttn = 57.0\n",
      "\n",
      "Results for predictions out-of-sample data:\n",
      "tp = 28.0\tfn = 9.0\n",
      "fp = 9.0\ttn = 24.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import urllib2\n",
    "import numpy\n",
    "import random\n",
    "from sklearn import datasets, linear_model\n",
    "\n",
    "\n",
    "def confusionMatrix(predicted, actual, threshold):\n",
    "    if len(predicted) != len(actual): return -1\n",
    "    tp = 0.0\n",
    "    fp = 0.0\n",
    "    tn = 0.0\n",
    "    fn = 0.0\n",
    "    for i in range(len(actual)):\n",
    "        if actual[i] > 0.5: #labels that are 1.0  (positive examples)\n",
    "            if predicted[i] > threshold:\n",
    "                tp += 1.0 #correctly predicted positive\n",
    "            else:\n",
    "                fn += 1.0 #incorrectly predicted negative\n",
    "        else:              #labels that are 0.0 (negative examples)\n",
    "            if predicted[i] < threshold:\n",
    "                tn += 1.0 #correctly predicted negative\n",
    "            else:\n",
    "                fp += 1.0 #incorrectly predicted positive\n",
    "    rtn = [tp, fn, fp, tn]\n",
    "    return rtn\n",
    "\n",
    "\n",
    "#read data from uci data repository\n",
    "target_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data\"\n",
    "data = urllib2.urlopen(target_url)\n",
    "\n",
    "#arrange data into list for labels and list of lists for attributes\n",
    "xList = []\n",
    "labels = []\n",
    "for line in data:\n",
    "    #split on comma\n",
    "    row = line.strip().split(\",\")\n",
    "    #assign label 1.0 for \"M\" and 0.0 for \"R\"\n",
    "    if(row[-1] == 'M'):\n",
    "        labels.append(1.0)\n",
    "    else:\n",
    "        labels.append(0.0)\n",
    "    #remove label from row\n",
    "    row.pop()\n",
    "    #convert row to floats\n",
    "    floatRow = [float(num) for num in row]\n",
    "    xList.append(floatRow)\n",
    "\n",
    "#divide attribute matrix and label vector into training(2/3 of data) and test sets (1/3 of data)\n",
    "indices = range(len(xList))\n",
    "xListTest = [xList[i] for i in indices if i%3 == 0 ]\n",
    "xListTrain = [xList[i] for i in indices if i%3 != 0 ]\n",
    "labelsTest = [labels[i] for i in indices if i%3 == 0]\n",
    "labelsTrain = [labels[i] for i in indices if i%3 != 0]\n",
    "\n",
    "#form list of list input into numpy arrays to match input class for scikit-learn linear model\n",
    "xTrain = numpy.array(xListTrain)\n",
    "yTrain = numpy.array(labelsTrain)\n",
    "xTest = numpy.array(xListTest)\n",
    "yTest = numpy.array(labelsTest)\n",
    "\n",
    "#train linear regression model\n",
    "rocksVMinesModel = linear_model.LinearRegression()\n",
    "rocksVMinesModel.fit(xTrain,yTrain)\n",
    "\n",
    "#generate predictions on in-sample error\n",
    "trainingPredictions = rocksVMinesModel.predict(xTrain)\n",
    "\n",
    "#generate confusion matrix for predictions on training set (in-sample)\n",
    "#The optimal threshold will be reached, when the accuracy of the model is the highest. Otherwise you can also use the ROC curve\n",
    "confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5)\n",
    "\n",
    "#pick threshold value and generate confusion matrix entries\n",
    "tp = confusionMatTrain[0]; fn = confusionMatTrain[1]; fp = confusionMatTrain[2]; tn = confusionMatTrain[3]\n",
    "\n",
    "print(\"Results for predictions in-sample data:\")\n",
    "print(\"tp = \" + str(tp) + \"\\tfn = \" + str(fn) + \"\\n\" + \"fp = \" + str(fp) + \"\\ttn = \" + str(tn) + '\\n')\n",
    "\n",
    "#generate predictions on out-of-sample data\n",
    "testPredictions = rocksVMinesModel.predict(xTest)\n",
    "\n",
    "#generate confusion matrix from predictions on out-of-sample data\n",
    "conMatTest = confusionMatrix(testPredictions, yTest, 0.5)\n",
    "\n",
    "#pick threshold value and generate confusion matrix entries\n",
    "tp = conMatTest[0]; fn = conMatTest[1]; fp = conMatTest[2]; tn = conMatTest[3]\n",
    "\n",
    "print(\"Results for predictions out-of-sample data:\")\n",
    "print(\"tp = \" + str(tp) + \"\\tfn = \" + str(fn) + \"\\n\" + \"fp = \" + str(fp) + \"\\ttn = \" + str(tn) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
